In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings 
filterwarnings("ignore")
import plotly.express as px

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
data = ("C:\\Users\\laxma\\Downloads\\netflix.csv")
data = pd.read_csv(data)
In [3]:
data.head()
Out[3]:
title genre language imdb_score premiere runtime year
0 Notes for My Son Drama Spanish 6.3 11/24/2020 83 2020
1 To Each, Her Own Romantic comedy French 5.3 6/24/2018 95 2018
2 The Lovebirds Romantic comedy English 6.1 5/22/2020 87 2020
3 The Perfection Horror-thriller English 6.1 5/24/2019 90 2019
4 Happy Anniversary Romantic comedy English 5.8 3/30/2018 78 2018
In [4]:
data.shape
Out[4]:
(583, 7)
In [5]:
col_names = ['Name','classification','language type','rating','Relase date','Duration','year']

data.columns = col_names
col_names
Out[5]:
['Name',
 'classification',
 'language type',
 'rating',
 'Relase date',
 'Duration',
 'year']
In [6]:
data.head()
Out[6]:
Name classification language type rating Relase date Duration year
0 Notes for My Son Drama Spanish 6.3 11/24/2020 83 2020
1 To Each, Her Own Romantic comedy French 5.3 6/24/2018 95 2018
2 The Lovebirds Romantic comedy English 6.1 5/22/2020 87 2020
3 The Perfection Horror-thriller English 6.1 5/24/2019 90 2019
4 Happy Anniversary Romantic comedy English 5.8 3/30/2018 78 2018
In [7]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            583 non-null    object 
 1   classification  583 non-null    object 
 2   language type   583 non-null    object 
 3   rating          583 non-null    float64
 4   Relase date     583 non-null    object 
 5   Duration        583 non-null    int64  
 6   year            583 non-null    int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 32.0+ KB
In [8]:
col_names = ['Name','classification','language type','rating','Relase date','Duration','year']
for col in col_names:
    print(data[col].value_counts())
Notes for My Son                           1
ReMastered: The Miami Showband Massacre    1
The Day of the Lord                        1
Milestone                                  1
Sardar Ka Grandson                         1
                                          ..
To All the Boys I've Loved Before          1
Tribhanga ? Tedhi Medhi Crazy              1
Team Foxcatcher                            1
The Players                                1
Biggie: I Got a Story to Tell              1
Name: Name, Length: 583, dtype: int64
Documentary                    159
Drama                           77
Comedy                          49
Romantic comedy                 39
Thriller                        33
                              ... 
Political thriller               1
Fantasy                          1
Romantic comedy-drama            1
Animation/Musical/Adventure      1
Supernatural drama               1
Name: classification, Length: 114, dtype: int64
English                       401
Hindi                          32
Spanish                        31
French                         20
Italian                        14
Portuguese                     12
Indonesian                      9
Korean                          6
Japanese                        6
English/Spanish                 5
German                          5
Turkish                         5
Polish                          3
Dutch                           3
Marathi                         3
Filipino                        2
Thai                            2
English/Japanese                2
English/Hindi                   2
English/Mandarin                2
English/Korean                  1
Khmer/English/French            1
English/Akan                    1
Bengali                         1
English/Swedish                 1
English/Arabic                  1
English/Taiwanese/Mandarin      1
Norwegian                       1
Tamil                           1
English/Ukranian/Russian        1
Spanish/Catalan                 1
English/Russian                 1
Georgian                        1
Spanish/English                 1
Swedish                         1
Malay                           1
Thia/English                    1
Spanish/Basque                  1
Name: language type, dtype: int64
6.3    30
5.8    30
6.4    28
7.1    28
6.5    26
6.7    25
6.1    24
6.8    24
7.3    21
7.2    20
5.7    20
5.2    19
5.5    19
7.0    19
6.9    19
6.6    18
6.2    18
5.9    16
5.6    15
5.4    13
6.0    13
7.4    12
7.5    10
7.6    10
5.3    10
4.6     8
7.7     8
4.8     7
4.7     6
4.4     6
5.1     6
5.0     5
8.2     5
4.1     4
4.9     4
7.9     4
4.5     4
8.1     3
4.3     3
7.8     3
8.4     3
3.7     2
8.3     2
4.2     2
2.6     2
9.0     1
8.0     1
3.2     1
3.9     1
3.5     1
8.6     1
3.4     1
8.5     1
2.5     1
Name: rating, dtype: int64
10/2/2020     6
11/1/2019     5
10/18/2019    5
12/7/2018     4
1/15/2021     4
             ..
12/3/2020     1
8/2/2019      1
6/16/2017     1
10/13/2020    1
3/1/2021      1
Name: Relase date, Length: 386, dtype: int64
97     24
98     19
94     19
95     18
100    17
       ..
45      1
25      1
54      1
51      1
13      1
Name: Duration, Length: 124, dtype: int64
2020    182
2019    125
2018     99
2021     71
2017     66
2016     30
2015      9
2014      1
Name: year, dtype: int64
In [9]:
data['year'].value_counts()
Out[9]:
2020    182
2019    125
2018     99
2021     71
2017     66
2016     30
2015      9
2014      1
Name: year, dtype: int64
In [10]:
data.isnull().sum()
Out[10]:
Name              0
classification    0
language type     0
rating            0
Relase date       0
Duration          0
year              0
dtype: int64
In [11]:
data.columns
Out[11]:
Index(['Name', 'classification', 'language type', 'rating', 'Relase date',
       'Duration', 'year'],
      dtype='object')
In [12]:
#VISUALIZATION
In [13]:
plt.figure(figsize=(10,4))
sns.countplot(x='rating', data=data, color='cyan')
plt.xticks(rotation=90)
plt.show()
In [14]:
plt.bar(data['rating'],data['year'])
plt.xticks(rotation=90)
plt.show()
In [15]:
fig=px.violin(data,x='Name',y='classification',color='Name')
fig.show()
In [16]:
fig=px.bar(data,x='Duration',y='language type',color='language type')
fig.show()
In [17]:
sns.barplot(data['year'],data['language type'],color='r')
plt.xticks(rotation=90)
plt.show()
In [18]:
sns.lineplot(x='Duration', y='rating', data=data).set_title('')
Out[18]:
Text(0.5, 1.0, '')
In [19]:
sns.displot(data["language type"])
plt.xticks(rotation=90)
Out[19]:
([0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37],
 [Text(0, 0, 'Spanish'),
  Text(1, 0, 'French'),
  Text(2, 0, 'English'),
  Text(3, 0, 'Portuguese'),
  Text(4, 0, 'English/Mandarin'),
  Text(5, 0, 'English/Spanish'),
  Text(6, 0, 'German'),
  Text(7, 0, 'Italian'),
  Text(8, 0, 'Korean'),
  Text(9, 0, 'Thia/English'),
  Text(10, 0, 'Hindi'),
  Text(11, 0, 'Malay'),
  Text(12, 0, 'Japanese'),
  Text(13, 0, 'Marathi'),
  Text(14, 0, 'Swedish'),
  Text(15, 0, 'Indonesian'),
  Text(16, 0, 'Dutch'),
  Text(17, 0, 'Filipino'),
  Text(18, 0, 'Spanish/English'),
  Text(19, 0, 'English/Taiwanese/Mandarin'),
  Text(20, 0, 'Georgian'),
  Text(21, 0, 'English/Hindi'),
  Text(22, 0, 'English/Russian'),
  Text(23, 0, 'Spanish/Catalan'),
  Text(24, 0, 'English/Ukranian/Russian'),
  Text(25, 0, 'Tamil'),
  Text(26, 0, 'Norwegian'),
  Text(27, 0, 'Turkish'),
  Text(28, 0, 'English/Arabic'),
  Text(29, 0, 'Polish'),
  Text(30, 0, 'English/Swedish'),
  Text(31, 0, 'Bengali'),
  Text(32, 0, 'English/Japanese'),
  Text(33, 0, 'Thai'),
  Text(34, 0, 'English/Korean'),
  Text(35, 0, 'Khmer/English/French'),
  Text(36, 0, 'English/Akan'),
  Text(37, 0, 'Spanish/Basque')])
In [20]:
#MODEL BUILDING
In [21]:
X = data.drop(['year'], axis = 1)
y = data['year']
In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)
In [23]:
X_train.shape, X_test.shape
Out[23]:
((390, 6), (193, 6))
In [24]:
X_train.dtypes
Out[24]:
Name               object
classification     object
language type      object
rating            float64
Relase date        object
Duration            int64
dtype: object
In [25]:
X_train.head()
Out[25]:
Name classification language type rating Relase date Duration
552 American Son Drama English 5.8 11/1/2019 90
280 Beats Drama English 7.1 6/19/2019 110
234 Nappily Ever After Comedy-drama English 6.4 9/21/2018 98
255 Eurovision Song Contest: The Story of Fire Saga Musical comedy English 6.5 6/26/2020 123
438 A Christmas Prince: The Royal Wedding Romantic comedy English 5.3 11/30/2018 92
In [26]:
pip install category_encoders
Requirement already satisfied: category_encoders in d:\anaconda files\lib\site-packages (2.6.3)
Requirement already satisfied: numpy>=1.14.0 in d:\anaconda files\lib\site-packages (from category_encoders) (1.26.4)
Requirement already satisfied: scipy>=1.0.0 in d:\anaconda files\lib\site-packages (from category_encoders) (1.9.1)
Requirement already satisfied: pandas>=1.0.5 in d:\anaconda files\lib\site-packages (from category_encoders) (1.4.4)
Requirement already satisfied: statsmodels>=0.9.0 in d:\anaconda files\lib\site-packages (from category_encoders) (0.13.2)
Requirement already satisfied: scikit-learn>=0.20.0 in d:\anaconda files\lib\site-packages (from category_encoders) (1.0.2)
Requirement already satisfied: patsy>=0.5.1 in d:\anaconda files\lib\site-packages (from category_encoders) (0.5.2)
Requirement already satisfied: python-dateutil>=2.8.1 in d:\anaconda files\lib\site-packages (from pandas>=1.0.5->category_encoders) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in d:\anaconda files\lib\site-packages (from pandas>=1.0.5->category_encoders) (2022.1)
Requirement already satisfied: six in d:\anaconda files\lib\site-packages (from patsy>=0.5.1->category_encoders) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\anaconda files\lib\site-packages (from scikit-learn>=0.20.0->category_encoders) (2.2.0)
Requirement already satisfied: joblib>=0.11 in d:\anaconda files\lib\site-packages (from scikit-learn>=0.20.0->category_encoders) (1.1.0)
Collecting numpy>=1.14.0
  Using cached numpy-1.24.4-cp39-cp39-win_amd64.whl (14.9 MB)
Requirement already satisfied: packaging>=21.3 in d:\anaconda files\lib\site-packages (from statsmodels>=0.9.0->category_encoders) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in d:\anaconda files\lib\site-packages (from packaging>=21.3->statsmodels>=0.9.0->category_encoders) (3.0.9)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
Note: you may need to restart the kernel to use updated packages.
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'D:\\anaconda files\\Lib\\site-packages\\~3mpy.libs\\libopenblas64__v0.3.23-293-gc2f4bdbb-gcc_10_3_0-2bde3a66a51006b2b53eb373ff767a3f.dll'
Consider using the `--user` option or check the permissions.

In [27]:
import category_encoders as ce
In [28]:
encoder = ce.OrdinalEncoder(cols=['Name','classification','language type','rating','Relase date','Duration'])

X_train = encoder.fit_transform(X_train)

X_test = encoder.transform(X_test)
In [29]:
X_train.head()
Out[29]:
Name classification language type rating Relase date Duration
552 1 1 1 1 1 1
280 2 1 1 2 2 2
234 3 2 1 3 3 3
255 4 3 1 4 4 4
438 5 4 1 5 5 5
In [30]:
X_test.head()
Out[30]:
Name classification language type rating Relase date Duration
355 -1.0 1.0 1.0 20.0 -1.0 40.0
407 -1.0 18.0 1.0 42.0 -1.0 32.0
90 -1.0 5.0 1.0 2.0 -1.0 20.0
402 -1.0 44.0 1.0 28.0 34.0 61.0
268 -1.0 5.0 17.0 -1.0 -1.0 32.0
In [31]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(random_state=60)
rfc.fit(X_train,y_train)
Out[31]:
RandomForestClassifier(random_state=60)
In [32]:
y_pred=rfc.predict(X_test)
y_pred
Out[32]:
array([2019, 2019, 2019, 2020, 2019, 2019, 2019, 2019, 2019, 2019, 2018,
       2017, 2019, 2019, 2019, 2019, 2019, 2020, 2019, 2018, 2019, 2019,
       2019, 2019, 2019, 2019, 2019, 2019, 2018, 2019, 2019, 2019, 2020,
       2019, 2020, 2019, 2019, 2019, 2019, 2018, 2019, 2019, 2019, 2019,
       2018, 2019, 2019, 2019, 2019, 2020, 2019, 2020, 2019, 2019, 2020,
       2019, 2020, 2019, 2019, 2019, 2020, 2019, 2019, 2020, 2020, 2019,
       2019, 2020, 2019, 2019, 2017, 2019, 2019, 2019, 2019, 2020, 2019,
       2020, 2019, 2018, 2019, 2019, 2020, 2020, 2019, 2018, 2018, 2017,
       2020, 2019, 2018, 2019, 2019, 2020, 2019, 2019, 2019, 2019, 2019,
       2019, 2020, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
       2019, 2020, 2019, 2019, 2020, 2021, 2019, 2019, 2019, 2019, 2019,
       2018, 2018, 2019, 2017, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
       2018, 2018, 2019, 2019, 2018, 2017, 2019, 2019, 2019, 2019, 2020,
       2019, 2020, 2019, 2019, 2019, 2019, 2019, 2020, 2020, 2019, 2019,
       2019, 2018, 2019, 2019, 2019, 2020, 2018, 2019, 2019, 2020, 2019,
       2019, 2018, 2019, 2018, 2019, 2018, 2018, 2019, 2019, 2019, 2019,
       2019, 2019, 2019, 2019, 2020, 2020, 2020, 2019, 2019, 2019, 2019,
       2018, 2020, 2018, 2020, 2019, 2019], dtype=int64)
In [33]:
from sklearn.metrics import accuracy_score
print('model accuracy score with 10 decision-tree : {0:0.4f}' . format(accuracy_score(y_test, y_pred)))
model accuracy score with 10 decision-tree : 0.2850
In [ ]: